{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create a conda environment with Python 3.8:\n",
    "\n",
    "`conda create -n gptfuzz python=3.8`"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Activate the environment:\n",
    "\n",
    "`conda activate gptfuzz`"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Install the PyTorch build that matches your CUDA version (the example command below targets CUDA 12.1):\n",
    "\n",
    "`conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "8\n",
      "cuda:0\n"
     ]
    }
   ],
   "source": [
    "# Run this cell to check that a GPU is available\n",
    "import torch\n",
    "\n",
    "print(torch.cuda.device_count())\n",
    "a = torch.zeros(100, device='cuda')\n",
    "print(a.device)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Install the FastChat package:\n",
    "\n",
    "```\n",
    "pip3 install \"fschat[model_worker,webui]\"\n",
    "```\n",
    "Or install from source:\n",
    "```\n",
    "git clone https://github.com/lm-sys/FastChat.git\n",
    "\n",
    "cd FastChat\n",
    "\n",
    "pip3 install --upgrade pip  # enable PEP 660 support\n",
    "\n",
    "pip3 install -e \".[model_worker,webui]\"\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Install vLLM. It can be installed via pip, but we suggest installing from source if you are using CUDA 12.\n",
    "\n",
    "```\n",
    "pip install vllm\n",
    "```\n",
    "\n",
    "Or install from source (compilation can take a while, depending on your build environment):\n",
    "\n",
    "```\n",
    "git clone https://github.com/vllm-project/vllm.git\n",
    "cd vllm\n",
    "python setup.py develop   # thanks to https://github.com/vllm-project/vllm/issues/385#issuecomment-1632806112\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/jys3649/miniconda3/envs/gptfuzzdev/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "2023-10-30 19:56:03,881\tINFO util.py:159 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n",
      "2023-10-30 19:56:04,256\tINFO util.py:159 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n"
     ]
    }
   ],
   "source": [
    "# At this step, you should be able to use both Hugging Face inference and vLLM inference; here we do a quick test.\n",
    "import os\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = '1,2' # specify which GPU(s) to be used\n",
    "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
    "import torch\n",
    "from vllm import LLM\n",
    "from vllm import SamplingParams\n",
    "\n",
    "model_path = \"meta-llama/Llama-2-7b-chat-hf\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side='left', use_fast=False) # use_fast=False here for Llama\n",
    "tokenizer.pad_token = tokenizer.eos_token"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 10-30 19:56:05 llm_engine.py:72] Initializing an LLM engine with config: model='meta-llama/Llama-2-7b-chat-hf', tokenizer='meta-llama/Llama-2-7b-chat-hf', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=1, quantization=None, seed=0)\n",
      "INFO 10-30 19:56:05 tokenizer.py:31] For some LLaMA V1 models, initializing the fast tokenizer may take a long time. To reduce the initialization time, consider using 'hf-internal-testing/llama-tokenizer' instead of the original tokenizer.\n",
      "INFO 10-30 19:56:09 llm_engine.py:207] # GPU blocks: 7951, # CPU blocks: 512\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.42s/it]\n"
     ]
    }
   ],
   "source": [
    "# load the model\n",
    "sampling_params = SamplingParams(\n",
    "            temperature=0.0,\n",
    "            max_tokens=512,\n",
    "            )\n",
    "\n",
    "model_vllm = LLM(model=model_path, gpu_memory_utilization=0.95)  # it will automatically use the first GPU, if you would like to use multi-GPU, plz refer to vllm documentation\n",
    "model_hf = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map='cuda:1').eval()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Processed prompts:   0%|          | 0/3 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Processed prompts: 100%|██████████| 3/3 [00:01<00:00,  2.72it/s]\n",
      "/home/jys3649/miniconda3/envs/gptfuzzdev/lib/python3.8/site-packages/transformers/generation/configuration_utils.py:362: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.6` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.\n",
      "  warnings.warn(\n",
      "/home/jys3649/miniconda3/envs/gptfuzzdev/lib/python3.8/site-packages/transformers/generation/configuration_utils.py:367: UserWarning: `do_sample` is set to `False`. However, `top_p` is set to `0.9` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Generated text by vllm: \" The capital of France is Paris. I'm glad you asked! Paris is a beautiful city located in the northern central part of France. It is known for its stunning architecture, art museums, fashion, and gastronomy. The city is home to many famous landmarks such as the Eiffel Tower, Notre-Dame Cathedral, and the Louvre Museum. I hope this information helps! Let me know if you have any other questions.\"\n",
      "Generated text by vllm: \" Thank you for asking! The capital of Germany is Berlin. I'm glad to help! However, I want to point out that the question is quite simple and straightforward, and there's no harmful or unethical content in it. Please feel free to ask any other questions you might have, and I'll do my best to assist you.\"\n",
      "Generated text by vllm: \" Thank you for asking! The capital of Italy is Rome. I'm glad you asked! Is there anything else I can help you with?\"\n",
      "[\"The capital of France is Paris. I'm glad you asked! Paris is a beautiful city located in the northern central part of France. It is known for its stunning architecture, art museums, fashion, and gastronomy. The city is home to many famous landmarks such as the Eiffel Tower, Notre-Dame Cathedral, and the Louvre Museum. I hope this information helps! Let me know if you have any other questions.\", \"Thank you for asking! The capital of Germany is Berlin. I'm glad to help! However, I want to point out that the question is quite simple and straightforward, and there's no harmful or unethical content in it. Please feel free to ask any other questions you might have, and I'll do my best to assist you.\", \"Thank you for asking! The capital of Italy is Rome. I'm glad you asked! Is there anything else I can help you with?\"]\n"
     ]
    }
   ],
   "source": [
    "LLAMA2_PROMPT = {\n",
    "    \"description\": \"Llama 2 chat one shot prompt\",\n",
    "    \"prompt\": '''[INST] <<SYS>>\n",
    "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n",
    "\n",
    "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n",
    "<</SYS>>\n",
    "\n",
    "{instruction} [/INST] '''\n",
    "}\n",
    "\n",
    "prompt = [\"What is the capital of France?\", \"What is the capital of Germany?\", \"What is the capital of Italy?\"]\n",
    "\n",
    "llama_input = []\n",
    "for i in range(len(prompt)):\n",
    "    llama_input.append(LLAMA2_PROMPT['prompt'].format(instruction=prompt[i]))\n",
    "\n",
    "#vllm inference\n",
    "vllm_output = model_vllm.generate(llama_input, sampling_params=sampling_params)\n",
    "for output in vllm_output:\n",
    "    generated_text = output.outputs[0].text\n",
    "    print(f\"Generated text by vllm: {generated_text!r}\")\n",
    "\n",
    "#huggingface inference\n",
    "input_ids = tokenizer(llama_input, padding=True, return_tensors=\"pt\")\n",
    "input_ids['input_ids'] = input_ids['input_ids'].to('cuda:1')\n",
    "input_ids['attention_mask'] = input_ids['attention_mask'].to('cuda:1')\n",
    "num_input_tokens = input_ids['input_ids'].shape[1]\n",
    "outputs = model_hf.generate(input_ids['input_ids'],attention_mask=input_ids['attention_mask'].half(),\n",
    "                         max_new_tokens=512, do_sample=False, pad_token_id=tokenizer.pad_token_id)\n",
    "generation = tokenizer.batch_decode(outputs[:, num_input_tokens:], skip_special_tokens=True)\n",
    "print(generation)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Install the other required packages:\n",
    "```\n",
    "pip install openai                # for OpenAI LLMs\n",
    "pip install termcolor\n",
    "pip install openpyxl\n",
    "pip install google-generativeai   # for Google PaLM 2\n",
    "pip install anthropic             # for Anthropic LLMs\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "gptfuzznew",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
